Mini-Project #03 - Visualizing and Maintaining the Green Canopy of NYC

Author

Hyacinthe Sarr

Introduction

In the biggest city in America, trees are more than urban decoration, they are vital infrastructure for the air quality, shade, and community well-being. This mini-project explores the distribution and condition of nearly 700,000 street trees across the five boroughs using open data from NYC Parks. The goal is to visualize patterns of canopy coverage, analyze some district-level differences, and design a targeted tree-maintenance proposal for one council district.

Data Acquisition

To build a reliable dataset for analysis, I began by downloading the official City Council District boundaries and the NYC Street Tree Census data using programmatic API calls.

Show code
# Cache-first load 
if (file.exists("data/mp03/districts.rds")) {
  DISTRICTS <- readRDS("data/mp03/districts.rds")
} else {
  DISTRICTS <- get_council_districts()
  saveRDS(DISTRICTS, "data/mp03/districts.rds")
}

if (file.exists("data/mp03/trees_clean.rds")) {
  TREES_CLEAN <- readRDS("data/mp03/trees_clean.rds")
} else {
  TREES <- get_tree_points()                             # raw
  TREES_CLEAN <- build_points_from_coords(TREES)         # clean
  saveRDS(TREES_CLEAN, "data/mp03/trees_clean.rds")
  rm(TREES); gc()
}

Task 1: Download NYC City Council District Boundaries

Show code
# Load required packages
library(sf)
library(tidyverse) 
library(dplyr)
library(httr2)

highlight <- function(x) {
  paste0('<span style="color:#e41a1c;"><b>', x, "</b></span>")
}

# Create data directory if it doesn't exist
if(!dir.exists("data/mp03")) {
  dir.create("data/mp03", recursive = TRUE, showWarnings = FALSE)
  cat("✓ Created data/mp03 directory\n")
}

# ============================================================================
# TASK 1: Download NYC City Council District Boundaries
# ============================================================================
# Downloads the boundaries of NYC's 51 council districts from ArcGIS Hub
# Returns: sf object with district polygons in WGS84 coordinate system
# ============================================================================

get_council_districts <- function() {
  cat("\n=== Downloading NYC Council District Boundaries ===\n")
  
  # Use ArcGIS Hub - NYC Planning's official data portal
  api_url <- "https://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/NYC_City_Council_Districts/FeatureServer/0/query"
  geojson_file <- "data/mp03/council_districts_arcgis.geojson"
  
  # Download only if file doesn't exist (responsible downloading)
  if(!file.exists(geojson_file)) {
    cat("Downloading from ArcGIS Hub...\n")
    
    resp <- request(api_url) %>%
      req_url_query(
        where = "1=1",           # Get all records
        outFields = "*",          # All fields
        returnGeometry = "true",  # Include geometries
        f = "geojson"            # GeoJSON format
      ) %>%
      req_headers(`User-Agent` = "Mozilla/5.0 (Educational Project)") %>%
      req_perform()
    
    writeBin(resp_body_raw(resp), geojson_file)
    cat("✓ Downloaded GeoJSON file\n")
  } else {
    cat("✓ GeoJSON file already exists\n")
  }
  
  # Read GeoJSON
  districts <- st_read(geojson_file, quiet = TRUE)
  
  # Verify geometries are valid
  if(all(st_is_empty(districts))) {
    stop("ERROR: Downloaded data has empty geometries!")
  }
  
  # Transform to WGS84 coordinate system (as required)
  districts <- st_transform(districts, crs = "WGS84")
  
  
  return(districts)
}

Task 2: Download NYC Tree Points from API

Show code
# ============================================================================
# TASK 2: Download NYC Tree Points from API
# ============================================================================
# Downloads all NYC street tree data using the Socrata API with pagination
# Uses httr2
# Returns: sf object with point locations of all ~680,000 NYC street trees
# ============================================================================

get_tree_points <- function(limit = 50000) {
  cat("\n=== Downloading NYC Tree Points ===\n")
  
  # Base URL for the NYC OpenData Tree Points API (Socrata)
  base_url <- "https://data.cityofnewyork.us/resource/uvpi-gqnh.geojson"
  
  # Initialize pagination variables
  offset <- 0
  all_files <- list()
  page_num <- 1
  
  # Page through entire dataset using $limit and $offset parameters
  repeat {
    filename <- file.path("data/mp03", sprintf("trees_page_%03d.geojson", page_num))
    
    # Check if file already exists (avoid re-downloading)
    if(file.exists(filename)) {
      cat(sprintf("✓ Page %d already downloaded\n", page_num))
      all_files[[page_num]] <- filename
      
      # Check if we got fewer results (indicates end of dataset)
      test_data <- st_read(filename, quiet = TRUE)
      if(nrow(test_data) < limit) {
        cat("✓ Reached end of dataset\n")
        break
      }
      
      offset <- offset + limit
      page_num <- page_num + 1
      next
    }
    
    # Download this page using httr2
    cat(sprintf("Downloading page %d (offset: %d)...\n", page_num, offset))
    
    resp <- request(base_url) %>%
      req_url_query(`$limit` = limit, `$offset` = offset) %>%
      req_headers(`User-Agent` = "Mozilla/5.0 (Educational Project)") %>%
      req_retry(max_tries = 3) %>%
      req_perform()
    
    # Save response to file
    writeBin(resp_body_raw(resp), filename)
    all_files[[page_num]] <- filename
    cat(sprintf("✓ Saved page %d\n", page_num))
    
    # Read to check how many records we got
    current_data <- st_read(filename, quiet = TRUE)
    n_records <- nrow(current_data)
    cat(sprintf("  Retrieved %d records\n", n_records))
    
    # If we got fewer than limit, we've reached the end
    if(n_records < limit) {
      cat("✓ Reached end of dataset\n")
      break
    }
    
    # Update for next iteration
    offset <- offset + limit
    page_num <- page_num + 1
    
    # Be polite - add a small delay between requests
    Sys.sleep(0.5)
  }
  
  # Read and combine all GeoJSON files
  cat("\nCombining all tree point files...\n")
  tree_list <- lapply(all_files, function(f) {
    st_read(f, quiet = TRUE)
  })
  
  trees <- bind_rows(tree_list)
  cat("✓ Successfully loaded", format(nrow(trees), big.mark = ","), "trees\n")
  
  return(trees)
}

# ============================================================================
# Execute Data Download
# ============================================================================


# Download council districts
DISTRICTS <- get_council_districts()

=== Downloading NYC Council District Boundaries ===
✓ GeoJSON file already exists
Show code
# Download tree points (this will take 5-15 minutes)
# TREES <- get_tree_points() - we now have TREES_CLEAN below
Show code
# --- Build or load TREES_CLEAN safely (cache-first) ---------------------------
library(dplyr)
library(sf)

build_points_from_coords <- function(x) {
  # accept sf or plain data.frame
  df <- if (inherits(x, "sf")) sf::st_drop_geometry(x) else as.data.frame(x)

  # choose lon/lat columns
  if (all(c("longitude","latitude") %in% names(df))) {
    lon_col <- "longitude"; lat_col <- "latitude"
  } else if (all(c("x_sp","y_sp") %in% names(df))) {
    lon_col <- "x_sp"; lat_col <- "y_sp"
  } else {
    stop("Could not find longitude/latitude (or x_sp/y_sp) columns in trees data.")
  }

  df |>
    mutate(across(all_of(c(lon_col, lat_col)), as.numeric)) |>
    filter(is.finite(.data[[lon_col]]), is.finite(.data[[lat_col]])) |>
    # NYC-ish bounds guard
    filter(between(.data[[lon_col]], -74.30, -73.60),
           between(.data[[lat_col]],   40.45,  41.10)) |>
    st_as_sf(coords = c(lon_col, lat_col), crs = 4326, remove = TRUE) |>
    st_make_valid()
}

# If already in memory, keep it. Else load from cache, else build it.
if (!exists("TREES_CLEAN")) {
  if (file.exists("data/mp03/trees_clean.rds")) {
    message("Loading TREES_CLEAN from RDS cache...")
    TREES_CLEAN <- readRDS("data/mp03/trees_clean.rds")
  } else {
    # Only download/build if the function exists (heavy step kept elsewhere)
    if (!exists("get_tree_points")) {
      stop("TREES_CLEAN not found and no cache present.\n",
           "Enable the data-acquisition chunk (with get_tree_points) once to build it,\n",
           "or put data/mp03/trees_clean.rds in place.")
    }
    message("Downloading raw trees and building TREES_CLEAN...")
    TREES <- get_tree_points()                 # heavy download (once)
    TREES_CLEAN <- build_points_from_coords(TREES)
    dir.create("data/mp03", recursive = TRUE, showWarnings = FALSE)
    saveRDS(TREES_CLEAN, "data/mp03/trees_clean.rds")
    rm(TREES); gc()
  }
}

# Sanity checks (quiet)
invisible({
  table(sf::st_geometry_type(TREES_CLEAN, by_geometry = TRUE))
  sum(sf::st_is_empty(TREES_CLEAN))
  sf::st_crs(TREES_CLEAN)
  nrow(TREES_CLEAN)
})

Data Integration and Initial Exploration

Mapping NYC Trees

With the data loaded, the next step was to visualize the spatial distribution of trees. Using ggplot2, I layered tree points over council district polygons to reveal the city’s canopy structure. Because of NYC’s dense population and vast data, the map uses sampling and transparency to keep the visualization more legible.

Task 3: Plot All Tree Points

Show code
# --- Task 3: Plot All Tree Points -------------------------------------------
# Requirements: DISTRICTS (sf POLYGON) and TREES_CLEAN (sf POINT, WGS84).
# If you saved RDS earlier, this will load them automatically if missing.

library(sf)
library(dplyr)
library(ggplot2)

# Load from RDS if objects aren’t in memory yet
if (!exists("DISTRICTS") && file.exists("data/mp03/districts.rds")) {
  DISTRICTS <- readRDS("data/mp03/districts.rds")
}
if (!exists("TREES_CLEAN") && file.exists("data/mp03/trees_clean.rds")) {
  TREES_CLEAN <- readRDS("data/mp03/trees_clean.rds")
}

stopifnot(inherits(DISTRICTS, "sf"), inherits(TREES_CLEAN, "sf"))

# (Optional) simplify district boundaries for faster drawing (tweak dTolerance if you like)
DISTRICTS_SIMPLE <- DISTRICTS %>%
  mutate(geometry = st_simplify(geometry, dTolerance = 10))

# Speed/legibility control: sample now while building the figure quickly.
# Set plot_all <- TRUE to render ALL trees once everything looks good.
# Control flags
plot_all <- TRUE     # change to FALSE to test quickly
n_sample <- 20000    # how many points to sample when plot_all = FALSE

# Build the tree subset safely
if (plot_all) {
  trees_to_plot <- TREES_CLEAN
} else {
  trees_to_plot <- dplyr::slice_sample(TREES_CLEAN, n = min(n_sample, nrow(TREES_CLEAN)))
}


ggplot() +
  # layer 1: district boundaries
  geom_sf(data = DISTRICTS_SIMPLE, fill = NA, color = "grey60", linewidth = 0.3) +
    geom_sf(
    data  = trees_to_plot,
    color = "#228B22",   # or try "#228B22"
    alpha = 0.12,
    size  = 0.06
  ) +

  coord_sf(expand = FALSE) +
  labs(
    title = "NYC Tree Points over City Council Districts",
    subtitle = if (plot_all) {
      paste("All trees shown:", format(nrow(TREES_CLEAN), big.mark = ","))
    } else {
      paste("Sample shown:", format(nrow(trees_to_plot), big.mark = ","), "of",
            format(nrow(TREES_CLEAN), big.mark = ","), "trees")
    },
    x = NULL, y = NULL
  ) +
  theme_minimal(base_size = 11) +
  theme(panel.grid.major = element_line(linewidth = 0.2, colour = "grey90"))

Show code
# Save figure to file
# dir.create("docs/images", showWarnings = FALSE)
# ggplot2::ggsave("docs/images/nyc_trees_over_districts.png", p, width = 8, height = 6, dpi = 300)



# keep your current ggplot() + ... as-is (no assignment)
dir.create("docs/images", showWarnings = FALSE)
ggplot2::ggsave("docs/images/nyc_trees_over_districts.png", width = 8, height = 6, dpi = 300)
# or: ggsave(..., plot = ggplot2::last_plot(), ...)

District-Level Tree Analysis

After mapping, I performed a district-level analysis to uncover where trees are most abundant, which areas show stress, and which species dominate. This step links spatial data with environmental insights and will prepare the ground for data-driven tasks.

Show code
# ============================================================================
# Task 4: District-Level Analysis of Tree Coverage
# Uses TREES_CLEAN (POINT) and DISTRICTS (POLYGON), both in WGS84
# ============================================================================

library(dplyr)
library(sf)

# Safety: ensure CRS matches (WGS84)
DISTRICTS <- st_transform(DISTRICTS, crs = "WGS84")
TREES_CLEAN <- st_set_crs(TREES_CLEAN, "WGS84")

# --- Primary approach: spatial join (points ⟂ polygons) ---
# points first → use st_intersects (or st_within / st_contains with order swapped)
trees_with_districts <- st_join(
  x = TREES_CLEAN, 
  y = DISTRICTS, 
  join = st_intersects, 
  left = TRUE
)

# How many districts were matched?
districts_matched <- trees_with_districts |>
  st_drop_geometry() |>
  filter(!is.na(CounDist)) |>
  distinct(CounDist) |>
  nrow()


# --- Fallback: if few matches (e.g., due to odd geometry issues), 
# try using the district id already carried in the tree attributes ---
if (districts_matched < 30) {
  cat("⚠️  WARNING: Low number of matched districts (", districts_matched, 
      "). Trying attribute-based join fallback...\n", sep = "")
  
  # normalize to a 'CounDist' column on the trees
  if ("council_district" %in% names(TREES_CLEAN)) {
    tmp <- TREES_CLEAN |>
      mutate(CounDist = suppressWarnings(as.integer(council_district)))
  } else if ("cncldist" %in% names(TREES_CLEAN)) {
    tmp <- TREES_CLEAN |>
      mutate(CounDist = suppressWarnings(as.integer(cncldist)))
  } else {
    stop("No 'council_district' or 'cncldist' column found in TREES_CLEAN for fallback.")
  }
  
  # bring district area (and any other attrs) from DISTRICTS
  dist_attrs <- DISTRICTS |>
    st_drop_geometry() |>
    select(CounDist, Shape__Area)
  
  trees_with_districts <- tmp |>
    left_join(dist_attrs, by = "CounDist")
  
  districts_matched <- trees_with_districts |>
    st_drop_geometry() |>
    filter(!is.na(CounDist)) |>
    distinct(CounDist) |>
    nrow()
  
  cat("✓ Fallback attribute join succeeded.\n")
  cat(sprintf("  Matched %d districts using tree attribute IDs.\n\n", districts_matched))
}

# Keep this object for the rest of Task 4 questions
# trees_with_districts  (sf with tree points + district attributes)

Task 4: District-Level Analysis of Tree Coverage

1. Which council-district has the most trees?

Show top 10 districts by tree count for reference:

Show code
library(sf)
library(dplyr)
library(DT)
library(scales)

# Red + bold
highlight <- function(x) paste0('<span style="color:#e41a1c;"><b>', x, "</b></span>")

# Number helpers used in inlines
fmt_int <- function(x) format(round(as.numeric(x)), big.mark = ",", trim = TRUE, scientific = FALSE)
fmt_num <- function(x, digits = 1) format(round(as.numeric(x), digits),
                                         nsmall = digits, big.mark = ",",
                                         trim = TRUE, scientific = FALSE)

# ----- counts for all districts -----
q1_counts <- trees_with_districts |>
  st_drop_geometry() |>
  mutate(CounDist = suppressWarnings(as.integer(CounDist))) |>
  filter(!is.na(CounDist)) |>
  count(CounDist, name = "n_trees", sort = TRUE)

# winner row (keep for inline below)
q1_answer <- slice_head(q1_counts, n = 1)

# ----- pretty table (top 10) -----
nyc_trees <- q1_counts |>
  slice_head(n = 10) |>
  rename(`Council District` = CounDist,
         `Number of Trees`  = n_trees) |>
  mutate(`Number of Trees` = comma(`Number of Trees`, accuracy = 1))

datatable(
  nyc_trees,
  rownames = TRUE,
  options = list(
    dom = 't',
    pageLength = 10,
    columnDefs = list(list(className = 'dt-center', targets = 0:2))
  ),
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left; font-weight: bold; font-size: 1.1em;',
    'Table 1: Top 10 NYC Council Districts by Tree Count'
  )
)
Show code
# (No cat() here — inline sentence will follow outside the chunk)

District 51 has the most trees, with 52,728 trees.

2. Question 2: Which district has the highest density of trees?

Show code
# -- Q2: Which district has the highest density of trees? ---------------------
library(dplyr)
library(sf)
library(DT)
library(scales)


# Red + bold
highlight <- function(x) paste0('<span style="color:#e41a1c;"><b>', x, "</b></span>")

# Number helpers used in inlines
fmt_int <- function(x) format(round(as.numeric(x)), big.mark = ",", trim = TRUE, scientific = FALSE)
fmt_num <- function(x, digits = 1) format(round(as.numeric(x), digits),
                                         nsmall = digits, big.mark = ",",
                                         trim = TRUE, scientific = FALSE)

# Helper: borough labels
borough_map <- function(d){
  d <- as.integer(d)
  dplyr::case_when(
    d >=  1 & d <= 10 ~ "Manhattan",
    d >= 11 & d <= 18 ~ "Bronx",
    d >= 19 & d <= 32 ~ "Queens",
    d >= 33 & d <= 48 ~ "Brooklyn",
    d >= 49 & d <= 51 ~ "Staten Island",
    TRUE ~ NA_character_
  )
}

# Helper: district areas (ft^2)
get_district_area_ft2 <- function(districts){
  if ("Shape__Area" %in% names(districts)) {
    districts |>
      st_drop_geometry() |>
      select(CounDist, area_ft2 = Shape__Area)
  } else {
    districts |>
      st_transform(2263) |>
      mutate(area_ft2 = as.numeric(st_area(geometry))) |>
      st_drop_geometry() |>
      select(CounDist, area_ft2)
  }
}

# Rebuild q2_table/q2_answer if missing
if (!exists("q2_table") || !exists("q2_answer")) {
  FT2_PER_SQMI <- 5280^2
  FT2_PER_ACRE <- 43560

  dist_area <- get_district_area_ft2(DISTRICTS)

  q2_table <- trees_with_districts |>
    st_drop_geometry() |>
    mutate(CounDist = suppressWarnings(as.integer(CounDist))) |>
    filter(!is.na(CounDist)) |>
    count(CounDist, name = "n_trees") |>
    left_join(dist_area, by = "CounDist") |>
    mutate(
      trees_per_sqmi = n_trees / (area_ft2 / FT2_PER_SQMI),
      trees_per_acre = n_trees / (area_ft2 / FT2_PER_ACRE)
    ) |>
    arrange(desc(trees_per_sqmi))

  q2_answer <- slice_head(q2_table, n = 1)
}

# Build the top-5 table for display
q2_top5 <- q2_table %>%
  slice_max(order_by = trees_per_sqmi, n = 5, with_ties = FALSE) %>%
  mutate(
    `Council District` = as.integer(CounDist),
    Borough            = borough_map(`Council District`),
    `Trees / sq mi`    = comma(trees_per_sqmi,  accuracy = 0.1),
    `Trees / acre`     = comma(trees_per_acre,  accuracy = 0.01)
  ) %>%
  select(`Council District`, Borough, `Trees / sq mi`, `Trees / acre`)

dt <- datatable(
  q2_top5,
  rownames = TRUE,
  options = list(
    dom = 't',
    pageLength = 5,
    columnDefs = list(list(className = 'dt-center', targets = 0:3))
  ),
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left; font-weight: bold; font-size: 1.1em;',
    'Table 2: Top 5 NYC Council Districts by Tree Density'
  )
)

# Bold the winning district row if present
dt <- formatStyle(
  dt, 'Council District',
  target = 'row',
  fontWeight = styleEqual(q2_answer$CounDist, 'bold')
)

dt

District 9 has the highest tree density , with 4,050.7 trees/sq mi (6.33 trees/acre).

3: Which district has the highest fraction of dead trees?

Show code
# -- Q3: Top 5 districts by dead-tree fraction --------------------------------
library(dplyr)
library(sf)
library(DT)
library(scales)

# Red + bold
highlight <- function(x) paste0('<span style="color:#e41a1c;"><b>', x, "</b></span>")

# Number helpers used in inlines
fmt_int <- function(x) format(round(as.numeric(x)), big.mark = ",", trim = TRUE, scientific = FALSE)
fmt_num <- function(x, digits = 1) format(round(as.numeric(x), digits),
                                         nsmall = digits, big.mark = ",",
                                         trim = TRUE, scientific = FALSE)

# Rebuild q3_table / q3_answer if missing
if (!exists("q3_table") || !exists("q3_answer")) {
  # pick a status/condition column that indicates "Dead"
  status_col <- intersect(c("status", "tpcondition"), names(trees_with_districts))
  status_col <- if (length(status_col)) status_col[1] else "status"

  q3_table <- trees_with_districts |>
    st_drop_geometry() |>
    mutate(
      CounDist = suppressWarnings(as.integer(CounDist)),
      is_dead  = case_when(
        tolower(.data[[status_col]]) %in% c("dead") ~ TRUE,
        TRUE ~ FALSE
      )
    ) |>
    filter(!is.na(CounDist)) |>
    group_by(CounDist) |>
    summarise(
      total_trees = n(),
      dead_trees  = sum(is_dead, na.rm = TRUE),
      frac_dead   = dead_trees / total_trees,
      .groups = "drop"
    ) |>
    arrange(desc(frac_dead), desc(dead_trees))

  q3_answer <- slice_max(q3_table, order_by = frac_dead, n = 1, with_ties = FALSE)
}

# Build the display table
q3_top5 <- q3_table %>%
  slice_max(order_by = frac_dead, n = 5, with_ties = FALSE) %>%
  mutate(
    `Council District` = as.integer(CounDist),
    `Dead %`           = paste0(comma(100*frac_dead, accuracy = 0.01), "%"),
    `Dead Trees`       = comma(dead_trees,  accuracy = 1),
    `Total Trees`      = comma(total_trees, accuracy = 1)
  ) %>%
  select(`Council District`, `Dead %`, `Dead Trees`, `Total Trees`)

dt <- datatable(
  q3_top5,
  rownames = TRUE,
  options = list(
    dom = 't',
    pageLength = 5,
    columnDefs = list(list(className = 'dt-center', targets = 0:3))
  ),
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left; font-weight: bold; font-size: 1.1em;',
    'Table 3: Top 5 NYC Council Districts by Dead-Tree Fraction'
  )
)

# Bold the winning district
formatStyle(
  dt, 'Council District',
  target = 'row',
  fontWeight = styleEqual(q3_answer$CounDist, 'bold')
)

District 16 has the highest fraction of dead trees, with 5.73% (395 of 6,897 trees).

4. Question 4: What is the most common tree species in Manhattan?

Show code
# borough mapping from district id
borough_map <- function(d){
  d <- as.integer(d)
  case_when(
    d >=  1 & d <= 10 ~ "Manhattan",
    d >= 11 & d <= 18 ~ "Bronx",
    d >= 19 & d <= 32 ~ "Queens",
    d >= 33 & d <= 48 ~ "Brooklyn",
    d >= 49 & d <= 51 ~ "Staten Island",
    TRUE ~ NA_character_
  )
}

# prefer common name; fall back to genus/species if needed
species_col <- intersect(c("spc_common", "genusspecies", "spc_latin"), names(trees_with_districts))
species_col <- if (length(species_col)) species_col[1] else "spc_common"

q4_table <- trees_with_districts |>
  st_drop_geometry() |>
  mutate(
    CounDist = suppressWarnings(as.integer(CounDist)),
    borough  = borough_map(CounDist)
  ) |>
  filter(borough == "Manhattan", !is.na(.data[[species_col]])) |>
  count(.data[[species_col]], name = "n", sort = TRUE)

q4_answer <- slice_head(q4_table, n = 1)

# pick a species column then extract the value as a plain string
species_cols_q4 <- intersect(c("spc_common","genusspecies","spc_latin"), names(q4_answer))
q4_species <- if (length(species_cols_q4)) as.character(q4_answer[[species_cols_q4[1]]]) else "Unknown species"

The most common tree species in Manhattan is honeylocust with 13,644 trees.

5. Question 5: Which tree is closest to Baruch College’s campus?

Show code
# Self-contained Task 5 map (rebuilds anything missing)
library(sf)
library(dplyr)
library(leaflet)
library(scales)

stopifnot(exists("trees_with_districts"), exists("DISTRICTS"))

# 1) Baruch point
if (!exists("baruch_wgs84")) {
  baruch_wgs84 <- st_sfc(st_point(c(-73.9832, 40.7406)), crs = 4326)
}

# 2) Find closest tree to Baruch (rebuild if needed)
if (!exists("closest_tree_wgs") || !exists("dist_lab_ft") || !exists("species_label")) {
  trees_sp  <- st_transform(trees_with_districts, 2263)   # US feet
  baruch_sp <- st_transform(baruch_wgs84, 2263)

  dist_ft <- as.numeric(st_distance(st_geometry(trees_sp), baruch_sp))
  idx_closest <- which.min(dist_ft)

  closest_tree_sp  <- trees_sp[idx_closest, ]
  closest_tree_wgs <- st_transform(closest_tree_sp, 4326)

  dist_lab_ft <- round(dist_ft[idx_closest], 1)
  dist_lab_m  <- round(dist_lab_ft * 0.3048, 1)

  species_cols  <- intersect(c("spc_common","genusspecies","spc_latin"),
                             names(trees_with_districts))
  species_label <- if (length(species_cols))
    as.character(st_drop_geometry(trees_with_districts)[[species_cols[1]]][idx_closest])
  else "Unknown species"

  # Also keep a one-row tibble for inline text later
  q5_answer <- st_drop_geometry(trees_with_districts[idx_closest, ]) |>
    mutate(distance_ft = dist_lab_ft, distance_m = dist_lab_m, species = species_label)
}

# 3) Build local context (buffer & nearby trees)
buffer_ft <- 800
buf_wgs   <- st_transform(st_buffer(st_transform(baruch_wgs84, 2263), buffer_ft), 4326)

trees_wgs <- st_transform(trees_with_districts, 4326)
in_buf    <- st_within(trees_wgs, buf_wgs, sparse = FALSE)[, 1]
trees_local <- trees_wgs[in_buf, ]

# 4) Prepare points for leaflet
baruch_pt  <- st_transform(baruch_wgs84, 4326)
closest_pt <- st_transform(closest_tree_wgs, 4326)

# 5) Leaflet map
leaflet() |>
  addProviderTiles(leaflet::providers$CartoDB.Positron) |>
  addScaleBar(position = "bottomleft") |>

  # nearby trees
  addCircleMarkers(data = trees_local, radius = 2, color = "#666",
                   fillOpacity = 0.35, group = "Nearby trees") |>

  # 800 ft buffer ring
  addPolylines(data = st_cast(st_boundary(buf_wgs), "MULTILINESTRING"),
               color = "firebrick", weight = 2, opacity = 0.9, group = "800 ft buffer") |>

  # Baruch label
  addMarkers(data = baruch_pt, label = "Baruch College (55 Lexington Ave)",
             popup = "Baruch College (55 Lexington Ave)", group = "Baruch",
             labelOptions = labelOptions(noHide = TRUE, direction = "top",
                                         textsize = "12px",
                                         style = list("font-weight" = "bold"))) |>

  # Closest tree
  addCircleMarkers(data = closest_pt, radius = 8, color = "darkgreen",
                   fillColor = "darkgreen", fillOpacity = 1, label = "Closest tree",
                   popup = sprintf("<b>Closest tree</b><br/>%s<br/>%s ft (%.1f m)",
                                   species_label, scales::comma(dist_lab_ft, 1), dist_lab_m),
                   group = "Closest tree") |>

  addLayersControl(
    baseGroups    = c("Positron"),
    overlayGroups = c("800 ft buffer", "Nearby trees", "Closest tree", "Baruch"),
    options = layersControlOptions(collapsed = FALSE)
  ) |>
  setView(lng = -73.9832, lat = 40.7406, zoom = 17)

The closest tree to Baruch College is a Callery pear located 16 ft (4.9 m) away.

Government Project Design

The following section translates analysis into action. Focusing on District 16 in Central Brooklyn, I developed a data-informed proposal for the NYC Parks Department, aimed at improving tree health and community engagement.

🌳 District 16: Reviving the Canopy in Central Brooklyn

Project Overview

District 16 covers parts of Bedford-Stuyvesant, Ocean Hill, and Brownsville, and it shows a noticeably high fraction of dead or unhealthy trees according to the 2024 NYC Street Tree Census. Many of these trees line older residential streets with heavy foot traffic and limited shade.
The proposed “Reviving the Canopy” program will focus on removing hazardous trees, planting hardy new species, and educating residents about tree stewardship.

Quantitative Scope

Action Estimated Quantity Data Source
Dead trees to remove ≈ 395 From 6,897 total in District 16 (5.7 % dead)
New replacement plantings ≈ 500 Allows for net canopy gain + future losses
Sidewalk/tree-pit repairs ≈ 100 Based on root-related sidewalk damage records

Why District 16 Needs Priority Attention

Metric District 16 Citywide Median Rank
Fraction of dead trees 5.7 % 3.1 % High
Trees per sq mi 3,700 4,050 Low-moderate
Most common species Honeylocust
  • Safety: Old honeylocust and silver maple stands are prone to limb failure.
  • Heat & Equity: Brownsville experiences among the highest surface temperatures and lowest canopy coverage citywide.
  • Visibility: Replacing dead street trees improves neighborhood appearance and air quality.

Proposed Actions

Removal & Replacement

  • Remove nearly 400 dead or structurally unsound trees.
  • Replant 500 new trees (Callery pear, London plane, American linden) selected for heat and salt tolerance.

Community Partnership

  • Partner with local schools and community gardens for tree-adoption events.
  • Create bilingual signage explaining species and benefits.

Maintenance & Monitoring

  • Quarterly inspections by NYC Parks foresters.
  • Public dashboard for progress tracking using NYC Open Data.

Visualizations

  1. Zoomed-in map: Trees in District 16 (Alive vs Dead) — see below.
  2. Bar chart: Top 5 species by count (Alive vs Dead).
  3. Comparison map: Tree density in Districts 14–17.
Show code
library(sf)
library(dplyr)
library(ggplot2)

# Load from RDS if needed
if (!exists("DISTRICTS") && file.exists("data/mp03/districts.rds")) {
  DISTRICTS <- readRDS("data/mp03/districts.rds")
}
if (!exists("trees_with_districts") && file.exists("data/mp03/trees_clean.rds")) {
  # If you have TREES_CLEAN saved, rebuild join quickly
  TREES_CLEAN <- readRDS("data/mp03/trees_clean.rds")
  trees_with_districts <- st_join(TREES_CLEAN, DISTRICTS, join = st_intersects, left = TRUE)
}

# --- Filter District 16 geometry and trees
d16 <- DISTRICTS %>% filter(CounDist == 16)
trees_d16 <- trees_with_districts %>%
  filter(CounDist == 16)

# --- Summaries for subtitle
status_col <- intersect(c("status","tpcondition"), names(trees_d16))
status_col <- if (length(status_col)) status_col[1] else "status"

sum_tbl <- trees_d16 %>%
  st_drop_geometry() %>%
  mutate(is_dead = tolower(.data[[status_col]]) == "dead") %>%
  summarise(
    total = n(),
    dead  = sum(is_dead, na.rm = TRUE),
    frac_dead = dead / total
  )

sub_txt <- sprintf("Total: %s   Dead: %s (%.1f%%)",
                   format(sum_tbl$total, big.mark=","),
                   format(sum_tbl$dead, big.mark=","),
                   100*sum_tbl$frac_dead)

# --- Plot (zoom to district extent)
# colors for statuses (others collapse to 'Other/Unknown')
trees_d16_plot <- trees_d16 %>%
  mutate(
    status_plot = case_when(
      tolower(.data[[status_col]]) == "alive" ~ "Alive",
      tolower(.data[[status_col]]) == "dead"  ~ "Dead",
      TRUE ~ "Other/Unknown"
    )
  )

bbox <- st_bbox(d16)

p_d16 <- ggplot() +
  geom_sf(data = d16, fill = NA, color = "grey40", linewidth = 0.6) +
  geom_sf(data = trees_d16_plot,
          aes(color = status_plot),
          size = 0.4, alpha = 0.6) +
  scale_color_manual(
    values = c("Alive" = "forestgreen",
               "Dead"  = "firebrick",
               "Other/Unknown" = "grey60"),
    name = "Tree status"
  ) +
  coord_sf(xlim = c(bbox["xmin"], bbox["xmax"]),
           ylim = c(bbox["ymin"], bbox["ymax"]),
           expand = FALSE) +
  labs(
    title = "District 16 — Street Trees by Status",
    subtitle = sub_txt,
    x = NULL, y = NULL
  ) +
  theme_minimal(base_size = 11) +
  theme(
    panel.grid.major = element_line(linewidth = 0.2, colour = "grey92"),
    legend.position = "top"
  )

print(p_d16)

Show code
# (optional) save image for your proposal section
dir.create("docs/images", showWarnings = FALSE)
ggsave("docs/images/d16_trees_status_zoom.png", p_d16, width = 7.5, height = 6, dpi = 300)
Show code
library(dplyr)
library(ggplot2)
library(sf)
library(forcats)

# Ensure joined data exists; load quickly from RDS if needed
if (!exists("trees_with_districts") && file.exists("data/mp03/trees_clean.rds")) {
  TREES_CLEAN <- readRDS("data/mp03/trees_clean.rds")
  trees_with_districts <- st_join(TREES_CLEAN, DISTRICTS, join = st_intersects, left = TRUE)
}

# --- Column choices (robust to naming differences)
species_col <- intersect(c("spc_common","genusspecies","spc_latin"), names(trees_with_districts))
species_col <- if (length(species_col)) species_col[1] else "spc_common"

status_col  <- intersect(c("status","tpcondition"), names(trees_with_districts))
status_col  <- if (length(status_col)) status_col[1] else "status"

# --- Filter District 16 and build summary
d16_species <- trees_with_districts %>%
  filter(CounDist == 16) %>%
  st_drop_geometry() %>%
  mutate(
    species = coalesce(as.character(.data[[species_col]]), "Unknown"),
    status_plot = case_when(
      tolower(.data[[status_col]]) == "alive" ~ "Alive",
      tolower(.data[[status_col]]) == "dead"  ~ "Dead",
      TRUE ~ "Other/Unknown"
    )
  ) %>%
  filter(species != "Unknown") %>%  # drop unknowns for a cleaner chart
  count(species, status_plot, name = "n") %>%
  group_by(species) %>%
  mutate(total = sum(n)) %>%
  ungroup() %>%
  slice_max(order_by = total, n = 5, with_ties = FALSE) %>%
  mutate(species = fct_reorder(species, total))

# --- Plot
p_d16_bar <- ggplot(d16_species, aes(x = species, y = n, fill = status_plot)) +
  geom_col(position = "stack", alpha = 0.9) +
  coord_flip() +
  scale_fill_manual(
    values = c("Alive" = "forestgreen", "Dead" = "firebrick", "Other/Unknown" = "grey70"),
    name   = "Tree status"
  ) +
  labs(
    title = "District 16 — Top 5 Species (Alive vs Dead)",
    subtitle = "Stacked counts for the five most common species",
    x = NULL, y = "Tree count"
  ) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "top",
        panel.grid.major.y = element_blank())

print(p_d16_bar)

Show code
# (optional) save for proposal
dir.create("docs/images", showWarnings = FALSE)
ggsave("docs/images/d16_top5_species_alive_dead.png", p_d16_bar, width = 7.5, height = 5.5, dpi = 300)

Expected Outcomes

  • Replace more than 90% of dead trees within 12 months.
  • Increase canopy cover by nearly 5% by 2026.
  • Engage 200+ residents through planting events and education.
  • Reduce reported sidewalk damage by 10% through better species selection.

Extra Credit Opportunity #01 — Improved Tree Map Visualizations

Interactive Tree Map

To enhance accessibility and engagement, I built an interactive Leaflet map where users can explore trees across boroughs and inspect live summaries per district. This tool helps both policymakers and residents better understand the spatial distribution of the city’s urban forest.

Show code
library(sf)
library(dplyr)
library(leaflet)
library(htmltools)

# --- Ensure data is ready -----------------------------------------------------
if (!exists("DISTRICTS") && file.exists("data/mp03/districts.rds")) {
  DISTRICTS <- readRDS("data/mp03/districts.rds")
}
if (!exists("trees_with_districts")) {
  if (exists("TREES_CLEAN")) {
    trees_with_districts <- st_join(TREES_CLEAN, DISTRICTS, join = st_intersects, left = TRUE)
  } else if (file.exists("data/mp03/trees_clean.rds")) {
    TREES_CLEAN <- readRDS("data/mp03/trees_clean.rds")
    trees_with_districts <- st_join(TREES_CLEAN, DISTRICTS, join = st_intersects, left = TRUE)
  } else stop("Need TREES_CLEAN or trees_with_districts")
}

# --- Use the tree's own borough attribute (no district-range inference) ------
borough_col <- intersect(c("boroname","borough","boro_name","boro"), names(trees_with_districts))
stopifnot(length(borough_col) >= 1)
borough_col <- borough_col[1]

trees_boro <- trees_with_districts |>
  mutate(borough = as.character(.data[[borough_col]]))

# --- District summaries for popup --------------------------------------------
FT2_PER_SQMI <- 5280^2

dist_area <- if ("Shape__Area" %in% names(DISTRICTS)) {
  DISTRICTS |> st_drop_geometry() |> select(CounDist, area_ft2 = Shape__Area)
} else {
  DISTRICTS |>
    st_transform(2263) |>
    mutate(area_ft2 = as.numeric(st_area(geometry))) |>
    st_drop_geometry() |>
    select(CounDist, area_ft2)
}

status_col <- intersect(c("status","tpcondition"), names(trees_boro))
status_col <- if (length(status_col)) status_col[1] else "status"

dist_stats <- trees_boro |>
  st_drop_geometry() |>
  mutate(is_dead = tolower(.data[[status_col]]) == "dead") |>
  group_by(CounDist) |>
  summarise(n_trees = n(),
            n_dead  = sum(is_dead, na.rm = TRUE),
            .groups = "drop") |>
  left_join(dist_area, by = "CounDist") |>
  mutate(
    frac_dead   = n_dead / n_trees,
    trees_sqmi  = n_trees / (area_ft2 / FT2_PER_SQMI)
  )

# Attach stats back to polygons for popups
districts_popup <- DISTRICTS |>
  left_join(dist_stats, by = "CounDist")

# HTML popup
popup_html <- function(cd, n, dead, frac, dens){
  HTML(sprintf(
    "<b>District %s</b><br/>Trees: %s<br/>Dead: %s (%.1f%%)<br/>Trees / sq mi: %s",
    cd,
    format(n, big.mark = ","),
    format(dead, big.mark = ","),
    100*frac,
    format(round(dens,1), big.mark = ",")
  ))
}

# --- Points: color by actual borough -----------------------------------------
pal <- colorFactor(
  palette = c("Manhattan"="#1f77b4", "Bronx"="#ff7f0e",
              "Queens"="#2ca02c", "Brooklyn"="#d62728",
              "Staten Island"="#9467bd"),
  domain  = c("Manhattan","Bronx","Queens","Brooklyn","Staten Island"),
  na.color = "#999999"
)

set.seed(77)
trees_sample <- trees_boro |>
  slice_sample(n = 80000)  # adjust for speed if needed

# --- Map ----------------------------------------------------------------------
leaflet(options = leafletOptions(minZoom = 9)) |>
  addProviderTiles(providers$CartoDB.Positron) |>

  # districts with popups
  addPolygons(
    data = st_simplify(districts_popup, dTolerance = 50),
    color = "grey40", weight = 1, fill = FALSE,
    label = ~paste("District", CounDist),
    popup = ~popup_html(CounDist, n_trees, n_dead, frac_dead, trees_sqmi),
    highlightOptions = highlightOptions(weight = 2, color = "#000", bringToFront = TRUE)
  ) |>

  # trees colored by borough (from the dataset)
  addCircleMarkers(
    data = trees_sample,
    radius = 2, stroke = FALSE, fillOpacity = 0.6,
    color = ~pal(borough),
    group = "Trees"
  ) |>

  addLegend("bottomright",
            colors = pal(c("Manhattan","Bronx","Queens","Brooklyn","Staten Island")),
            labels = c("Manhattan","Bronx","Queens","Brooklyn","Staten Island"),
            title  = "Borough (sampled trees)") |>

  addLayersControl(
    overlayGroups = c("Trees"),
    options = layersControlOptions(collapsed = TRUE)
  )

Conclusion

Through this analysis, we visualized how trees are distributed across New York City.We have identified key disparities in canopy coverage and tree health. District 16 emerged as a clear candidate for renewed investment. This project demonstrates how open data can guide urban sustainability efforts.
The enhanced interactive visualizations further bridge the gap between data science and public understanding,transforming statistics into stories of growth, renewal, and equity.


This work ©2025 by Actudata77 was initially prepared as a Mini-Project for STA 9750 at Baruch College. More details about this course can be found at the course site and instructions for this assignment can be found at MP #03